#!/usr/bin/env ruby

# Count how many probes are annotated to each gene in an LFF probe-definition
# file and write the per-gene totals to a tab-delimited report.
#
# Input:  ./methylationProbeInfo.lff -- lines carrying a "Symbol=<gene>"
#         attribute-value pair; lines whose first non-whitespace character
#         is "#" are treated as comments and ignored.
# Output: ./probesPerGene.txt -- a header line followed by one
#         "<gene>\t<count>" line per gene, in the order each gene was first
#         seen (Ruby hashes iterate in insertion order).

# Input and output locations, relative to the current working directory.
PROBE_DEF_PATH = "./methylationProbeInfo.lff"
PROBE_COUNT_PATH = "./probesPerGene.txt"

# Captures the value of the Symbol attribute: everything after "Symbol="
# (optional whitespace around the "=") up to the next ";", tab, or newline.
SYMBOL_PATTERN = /Symbol\s*=\s*([^;\n\t]+)/

# Gene name => number of probes seen. The default of 0 lets a gene be counted
# the first time it appears without a separate initialization step.
probes_per_gene = Hash.new(0)

# File.foreach opens the file, yields each line, and closes the file when the
# iteration ends -- including when an exception interrupts it.
File.foreach(PROBE_DEF_PATH) do |line|
  # Skip comment lines.
  next if line =~ /^\s*#/

  # Extract the gene symbol; nil means the line has no Symbol attribute.
  symbol = line.chomp[SYMBOL_PATTERN, 1]
  next if symbol.nil?

  # Count the probe under the gene name with surrounding whitespace removed.
  probes_per_gene[symbol.strip] += 1
end

# Write the report. The block form of File.open closes (and flushes) the file
# when the block exits, even if a write raises. Write-only mode is sufficient
# because the report is never read back.
File.open(PROBE_COUNT_PATH, "w") do |out_file|
  # Column header line.
  out_file.puts "#GeneName\tNum Probes"
  # One line per gene: name, tab, probe count.
  probes_per_gene.each do |gene_name, num_probes|
    out_file.puts "#{gene_name}\t#{num_probes}"
  end
end

# Exit with status 0 (no errors)
exit(0)
